This work has been published in Cancer Research Communications journal in April 2023. Available: https://doi.org/10.1158/2767-9764.CRC-22-0385

About Dataset

The original dataset used in this project was provided by BC Cancer Gastrointestinal Cancer Outcomes Unit (GICOU) and BC Vital Statistics. It contains a total of 20136 colorectal cancer cases received radiotherapy in BC between 2002 and 2016. All personal identifiers (agency ID, PHN, and DOB, etc.) were removed before saving the data into a csv file.

Load Packages

library(knitr)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, dpi = 100)
library(kableExtra)
library(tidyverse)
options(dplyr.summarise.inform = FALSE)
library(xlsx)
library(binom)
library(ggrepel)
library(ggalluvial)
library(survival)
library(survminer)

Data Import

data_raw <- read_csv("EOcrc_final_GI_RT_prot_w_Chemo_RTvars_20211123_personal_info_removed.csv",
                          show_col_types = FALSE)

Data Cleaning

  • Select columns: A total of 285 columns were identified in the original data but only 91 were selected for further analysis.
  • Assign age groups in two different ways:
    • <=29, 30-39, 40-49 or >=50
    • <50 or >=50
  • Determine geographic location: rural or urban?
    • Parse the second digit of the postal code
    • “0” - rural, other numbers - urban.
  • Convert cT, cN, pT, pN into numbers.
    • Use neoadjuvant as flag for any further analysis.
  • Replace unknown information with NA.
  • Convert date entries into proper date format.
data_cleaned <- data_raw %>% 
  select(diagnosis_date, site_admit_date, age_at_diagnosis, loc_at_diag, sex,
         performance_status, site, behavior, hist1, final_grade, tumour_subgroup,
         cT, cN, overall_clin_stg, pT, pN, m_stg, overall_path_stg, overall_stg,
         neoadjuvant, M1_at_Dx, COL_tum_size, final_tot_nodes, final_pos_nodes,
         final_MSI, cr_anal_verge_dist_onco, cr_anal_verge_dist_status_onco,
         final_CEA, cr_prim_site_p00020, final_margin_status, GI_RT_protcode1,
         cr_rt_prim_onco, cr_rt_prim_p00020, cr_system_tx_dt_onco,
         cr_system_tx_dt_fst_p00020, cr_system_tx_dt_stat_onco, cr_system_tx_onco,
         cr_system_tx_p00020, cr_system_tx_type_fst_p00020, surg_treat_date1,
         surg_treat_date2, surg_treat_date3, surg_treat_date4, RT_start_date1,
         RT_end_date1, RT_start_date2, RT_end_date2, RT_start_date3, RT_end_date3,
         RT_start_date4, RT_end_date4, RT_start_date5, RT_end_date5, RT_start_date6,
         RT_end_date6, m1dtofdx, m1site, fst_relapse_date, fst_reg_dist_date,
         LOCIND, LOCDATE, LOCSITE, REGIND, REGDATE, REGSITE, DISTIND, DISTDATE, DISTSITE,
         subseq_colorectalca, subseq_CRC_dx_date, pat_status, death_date,
         FINAL_last_contact_date, survyrs, colorectaldeath, colorectaldthdat,
         loctodth, regtodth, distant_date, anyreldt, anyenddt, anysurv, anystat,
         locregdate, locsurv, locstat,locenddt, min_mets_date,
         survyrs_from_dist_mets, relapse_to_lcd, dx_to_final_lcd) %>% 
  mutate(
    ## Assign age groups
    age_group1 = cut(age_at_diagnosis, breaks = c(0, 49, Inf),
                     labels = c("<50", ">=50")) %>% 
      factor(),
    age_group2 = cut(age_at_diagnosis, breaks = c(0, 29, 39, 49, Inf),
                     labels = c("<=29", "30-39", "40-49", ">=50")) %>% 
      factor(),
    
    ## remove unknown status (mostly number 8 or 9)
    across(c(performance_status, final_grade, overall_path_stg,
             overall_clin_stg, final_margin_status, colorectaldeath),
           ~ifelse(.x <= 4, .x, NA)),
    final_MSI = ifelse(final_MSI %in% c(7, 8), NA, final_MSI),
    
    ## convert T and N stage info into numbers
    across(c(cT, cN, pT, pN), parse_number),
    
    ## remove unknown node info (numbers > 90)
    across(c(final_tot_nodes, final_pos_nodes), ~ifelse(.x > 90, NA, .x)),
    
    ## remove unknown CEA info (numbers > 988), convert 980 to 98 (>98 category)
    final_CEA = ifelse(final_CEA > 988, NA, ifelse(final_CEA == 980, 98, final_CEA)),
    
    ## remove unknown margin status
    final_margin_status = ifelse(final_margin_status > 2, NA, final_margin_status),
    
    ## convert into date format
    across(c(diagnosis_date, site_admit_date, FINAL_last_contact_date,
             colorectaldthdat, distant_date, anyreldt, anyenddt, 
             locregdate, locenddt, min_mets_date),
           ~as.Date(.x, "%d-%b-%y")),
    across(c(cr_system_tx_dt_onco, surg_treat_date1, surg_treat_date2,
             surg_treat_date3, surg_treat_date4, RT_start_date1, RT_end_date1,
             RT_start_date2, RT_end_date2, RT_start_date3, RT_end_date3,
             RT_start_date4, RT_end_date4, RT_start_date5, RT_end_date5,
             RT_start_date6, RT_end_date6, m1dtofdx, fst_relapse_date,
             fst_reg_dist_date, LOCDATE, REGDATE, DISTDATE, subseq_CRC_dx_date,
             death_date),
           ~as.Date(.x, "%m/%d/%Y")))

Chart Review and Data Correction

Any data entry errors found throughout the entire analysis were corrected in this step. (Codes are hidden here.)

Filtering and Calculation

  1. Select all rectal cancer cases:
    • Select site == "C209" (site_desc == "Rectum, NOS") to select all rectal cancer cases.
    • DO NOT use tumor_subgroup == "RE" as it contains non-rectal cancer cases.
    • Exclude 1 case of hist1 == 80453 (hist1_desc == "Combined small cell carcinoma") and 9 cases of hist1 == 81482 (hist1_desc == "Glandular intraepithelial heoplasia, grade III).
    • Exclude 2 cases of cr_prim_site_p00020 == "B" (cr_prim_site_p00020 == "Rectosigmoid Junction").
  2. Merge tumor location columns cr_prim_site_p00020 and cr_anal_verge_dist_onco.
    • Create two new columns dist_merged and loc_merged.
    • Convert the group letters into averaged distances in dist_merged, e.g. "Rectum, upper (11-15 cm)"13, "Rectum, mid (5-10.9 cm)"8.
    • Convert the distances into group letters in loc_merged.
  3. Calculate tumor stage changes.
  4. Calculate each RT duration.
  5. Merge RT information columns cr_rt_prim_onco and cr_rt_prim_p00020.
    • Create rt_merged column
    • Select rt_merged values 1 (pre-op short) and 2 (pre-op long), categorize into long course group and short course group
    • Check rt_merged values 5, 77, 88 whether any cases qualify for pre-op RT (compare surg_treat_date1 and RT_start_date1) → result: none qualified
  6. Categorize multiple parameters
data_filtered <- data_cleaned %>% 
  ## select all rectal cancer cases
  filter(site  == "C209", hist1 != 80453, hist1 != 81482,
         (cr_prim_site_p00020 != "B") |
           is.na(cr_prim_site_p00020),
         (cr_anal_verge_dist_onco <= 15) |
           (cr_anal_verge_dist_onco %in% c(89, 99)) |
           is.na(cr_anal_verge_dist_onco)) %>% 
  
  ## merge distance/location info for further processing
  unite("dist_merged", c(cr_prim_site_p00020, cr_anal_verge_dist_onco),
        na.rm = TRUE, remove = FALSE) %>%
  
  ## merge RT info for further processing
  unite("rt_merged", c(cr_rt_prim_onco, cr_rt_prim_p00020),
        na.rm = TRUE, remove = FALSE) %>% 
  
  mutate(
    ## relabel sex
    sex = factor(sex, levels = c("F", "M"), labels = c("Female", "Male")),
    
    ## determine rural/urban
    rural_urban = ifelse(is.na(loc_at_diag), "Unknown",
                         ifelse(grepl(".0....", loc_at_diag), "Rural", "Urban")) %>% 
      factor(levels = c("Rural", "Urban", "Unknown")),
    
    ## convert letter into avg distance
    dist_merged = case_when(dist_merged == "C" ~ 13, # rectum, upper (11-15 cm)
                            dist_merged == "D" ~ 8, # rectum, mid (5-10.9 cm)
                            dist_merged == "E" ~ 2.5, # rectum, distal (<5 cm)
                            TRUE ~ as.numeric(dist_merged)),
    
    ## convert distance to letter
    loc_merged = case_when(cr_prim_site_p00020 == "C" ~ "Proximal (11-15 cm)",
                           cr_prim_site_p00020 == "D" ~ "Mid (5-10.9 cm)",
                           cr_prim_site_p00020 == "E" ~ "Distal (<5 cm)",
                           # cr_prim_site_p00020 == "F" ~ "NOS",
                           is.na(cr_prim_site_p00020) &
                             ((cr_anal_verge_dist_onco >= 11) &
                                (cr_anal_verge_dist_onco <= 15)) ~
                             "Proximal (11-15 cm)",
                           is.na(cr_prim_site_p00020) &
                             ((cr_anal_verge_dist_onco >= 5) &
                                (cr_anal_verge_dist_onco < 11)) ~
                             "Mid (5-10.9 cm)",
                           is.na(cr_prim_site_p00020) &
                             (cr_anal_verge_dist_onco < 5) ~
                             "Distal (<5 cm)") %>%
                           # TRUE ~ "NOS") %>% 
      factor(levels = c("Distal (<5 cm)", "Mid (5-10.9 cm)",
                        "Proximal (11-15 cm)")),
    
    ## pCR status
    pCR = case_when(((pT == 0) & (pN == 0)) | (overall_path_stg == 0) ~ "Yes",
                    !is.na(pT) & !is.na(pN) ~ "No") %>% 
      factor(levels = c("No", "Yes")),
    pT = case_when(pCR == "Yes" ~ 0,
                   TRUE ~ pT),
    pN = case_when(pCR == "Yes" ~ 0,
                   TRUE ~ pN),
    overall_path_stg = case_when(pCR == "Yes" ~ 0,
                                 TRUE ~ overall_path_stg),
    
    ## calculate changes in cancer stages
    T_stg_change = pT - cT,
    T_stg_change_status = case_when(T_stg_change < 0 ~ "Downstage",
                                    T_stg_change == 0 ~ "No change",
                                    T_stg_change > 0 ~ "Upstage") %>% 
      factor(),
    N_stg_change = pN - cN,
    N_stg_change_status = case_when(N_stg_change < 0 ~ "Downstage",
                                    N_stg_change == 0 ~ "No change",
                                    N_stg_change > 0 ~ "Upstage") %>% 
      factor(),
    overall_stg_change = overall_path_stg - overall_clin_stg,
    overall_stg_change_status = case_when(overall_stg_change < 0 ~ "Downstage",
                                          overall_stg_change == 0 ~ "No change",
                                          overall_stg_change > 0 ~ "Upstage") %>% 
      factor(),
    
    ## pCR status
    pCR = case_when((pT == 0) & (pN == 0) ~ "Yes",
                    !is.na(pT) & !is.na(pN) ~ "No") %>% 
      factor(levels = c("No", "Yes")),
    
    ## calculate RT durations
    RT_duration1 = RT_end_date1 - RT_start_date1,
    RT_duration2 = RT_end_date2 - RT_start_date2,
    RT_duration3 = RT_end_date3 - RT_start_date3,
    RT_duration4 = RT_end_date4 - RT_start_date4,
    RT_duration5 = RT_end_date5 - RT_start_date5,
    RT_duration6 = RT_end_date6 - RT_start_date6,
    
    ## identify pre-op RT and categorize into short and long course groups
    rt_merged = as.integer(rt_merged),
    rt_course = case_when(rt_merged == 1 ~ "Short-course RT",
                          (rt_merged == 2) & (is.na(GI_RT_protcode1)) ~ "Long-course RT",
                          (rt_merged == 2) & (!is.na(GI_RT_protcode1)) ~ "Long-course chemoRT") %>% 
      factor(levels = c("Short-course RT", "Long-course RT", "Long-course chemoRT")),
    
    ## categorize tumor grade
    grade = case_when(final_grade %in% c(1, 2) ~ "Well-Differentiated (Grades 1 and 2)",
                      final_grade %in% c(3, 4) ~ "Poorly Differentiated (Grades 3 and 4)",
                      final_grade == 0 ~ "Grade 0") %>% 
      factor(levels = c("Grade 0", "Well-Differentiated (Grades 1 and 2)",
                        "Poorly Differentiated (Grades 3 and 4)")),
    
    ## categorize cancer types
    histology = case_when(hist1 %in% c(80103, 80203, 80463, 81402, 81403, 81443,
                                       82102, 82103, 82113, 82203, 82213, 82553,
                                       82612, 82613, 82632, 82633) ~ "Adenocarcinoma",
                          hist1 %in% c(84803, 84813, 84903) ~ "Mucinous/Signet",
                          TRUE ~ "Other") %>%
      factor(),
    
    ## categorize MSI
    MSI_group = case_when(final_MSI %in% c(1, 2) ~ "MSI stable",
                          final_MSI %in% c(3, 9) ~ "MSI unstable, NOS; positive, NOS") %>% 
      factor(),
    
    ## categorize margin status
    margin = case_when(final_margin_status == 0 ~ "Microscopic Margins Clear",
                       final_margin_status == 1 ~ "Microscopic Margins positive (<=1mm)",
                       final_margin_status == 2 ~ "Macroscopic Residual Cancer, grossly positive") %>% 
      factor(levels = c("Microscopic Margins Clear", "Microscopic Margins positive (<=1mm)",
                        "Macroscopic Residual Cancer, grossly positive")),
    
    ## categorize node status
    node_total = case_when(final_tot_nodes >= 12 ~ ">=12",
                           final_tot_nodes < 12 ~ "<12") %>% 
      factor(),
    node_positive = case_when(final_pos_nodes == 0 ~ "0",
                              final_pos_nodes %in% c(1, 2) ~ "1-2",
                              final_pos_nodes %in% c(3, 4) ~ "3-4",
                              (final_pos_nodes >= 5) & (final_pos_nodes <= 9) ~ "5-9",
                              final_pos_nodes >= 10 ~ "10+") %>% 
      factor(levels = c("0", "1-2", "3-4", "5-9", "10+")),
    
    ## overall survival status
    surv_status = case_when(pat_status == "A" ~ 0,
                            pat_status == "D" ~ 1),
    
    ## disease specific status
    crc_death_status = case_when(colorectaldeath == 1 ~ 1,
                                 TRUE ~ 0),
    
    ## disease free status
    recurr_status = case_when(LOCIND | REGIND | DISTIND |
                                subseq_colorectalca | surv_status ~ 1,
                              TRUE ~ 0),
    dis_free_yrs = pmin(as.numeric((pmin(LOCDATE, REGDATE, DISTDATE, subseq_CRC_dx_date, death_date,
                                         na.rm = TRUE)-diagnosis_date)/365.25),
                        survyrs, na.rm = TRUE),
    
    ## disease free status with non CRC censored
    crc_recurr_status = case_when(LOCIND | REGIND | DISTIND |
                                    subseq_colorectalca | crc_death_status ~ 1,
                                  TRUE ~ 0),
    crc_recurr_free_yrs = pmin(as.numeric((pmin(LOCDATE, REGDATE, DISTDATE, subseq_CRC_dx_date, colorectaldthdat,
                                                na.rm = TRUE)-diagnosis_date)/365.25),
                               survyrs, na.rm = TRUE)
  )

Function for summarizing non-metastatic rectal cancer baseline data

summary_general <- function(df, group_var) {
  var <- deparse(substitute(group_var))
  Reduce(function(x, y) merge(x, y, by = var, all = TRUE),
         list(df %>% 
                filter(M1_at_Dx == 0) %>% 
                group_by(age_group2, {{group_var}}) %>% 
                summarize(sub_total = n()) %>% 
                pivot_wider(names_from = age_group2, values_from = sub_total) %>% 
                mutate_at(vars(-var), ~replace(., is.na(.), 0)) %>% 
                select(-`>=50`),
              df %>% 
                filter(M1_at_Dx == 0) %>% 
                group_by(age_group1, {{group_var}}) %>% 
                summarize(sub_total = n()) %>% 
                pivot_wider(names_from = age_group1, values_from = sub_total) %>% 
                mutate_at(vars(-var), ~replace(., is.na(.), 0)),
              df %>% 
                filter(M1_at_Dx == 0) %>% 
                group_by({{group_var}}) %>% 
                summarize(`Grand Total` = n()) %>% 
                mutate_at(vars(-var), ~replace(., is.na(.), 0)))
  ) %>% 
    arrange({{group_var}}) %>%
    mutate({{group_var}} := as.character({{group_var}})) %>%
    filter(!is.na({{group_var}})) %>%
    bind_rows(summarize(., across(where(is.numeric), sum),
                        across(where(is.character), ~"Subtotal"))) %>%
    rename(Parameter = {{group_var}}) %>% 
    
    # add percentage
    mutate_at(vars(-Parameter),
              ~if_else((Parameter != "Subtotal") & (.x != 0),
                       paste0(.x, ' (', sprintf("%.1f", round(.x/.x[Parameter == "Subtotal"]*100, 1)), ')'),
                       # paste0(.x, ' (', sprintf("%.1f", round(.x/sum(.x)*100, 1)), ')'),
                       as.character(.x)))
}

Table 1: Baseline characteristics for all non-metastatic rectal cancer cases

Baseline Characteristics <=29 30-39 40-49 <50 >=50 Grand Total
Sex
Female 11 (57.9) 47 (44.3) 158 (38.8) 216 (40.6) 2002 (35.1) 2218 (35.6)
Male 8 (42.1) 59 (55.7) 249 (61.2) 316 (59.4) 3698 (64.9) 4014 (64.4)
Subtotal 19 106 407 532 5700 6232
Geographic Location
Rural 2 (10.5) 11 (10.4) 53 (13.0) 66 (12.4) 838 (14.7) 904 (14.5)
Urban 17 (89.5) 95 (89.6) 352 (86.5) 464 (87.2) 4859 (85.2) 5323 (85.4)
Unknown 0 0 2 (0.5) 2 (0.4) 3 (0.1) 5 (0.1)
Subtotal 19 106 407 532 5700 6232
Tumor Location
Distal (<5 cm) 2 (22.2) 25 (30.9) 95 (29.9) 122 (29.9) 1209 (28.4) 1331 (28.6)
Mid (5-10.9 cm) 5 (55.6) 43 (53.1) 165 (51.9) 213 (52.2) 2339 (55.0) 2552 (54.8)
Proximal (11-15 cm) 2 (22.2) 13 (16.0) 58 (18.2) 73 (17.9) 703 (16.5) 776 (16.7)
Subtotal 9 81 318 408 4251 4659
Tumor Grade
Well-Differentiated (Grades 1 and 2) 14 (77.8) 81 (87.1) 311 (87.1) 406 (86.8) 4456 (88.9) 4862 (88.7)
Poorly Differentiated (Grades 3 and 4) 4 (22.2) 12 (12.9) 46 (12.9) 62 (13.2) 555 (11.1) 617 (11.3)
Subtotal 18 93 357 468 5011 5479
Histology
Adenocarcinoma 18 (94.7) 98 (92.5) 396 (97.3) 512 (96.2) 5475 (96.1) 5987 (96.1)
Mucinous/Signet 1 (5.3) 7 (6.6) 10 (2.5) 18 (3.4) 207 (3.6) 225 (3.6)
Other 0 1 (0.9) 1 (0.2) 2 (0.4) 18 (0.3) 20 (0.3)
Subtotal 19 106 407 532 5700 6232
MSI
MSI stable 4 (66.7) 24 (85.7) 77 (76.2) 105 (77.8) 369 (56.1) 474 (59.8)
MSI unstable, NOS; positive, NOS 2 (33.3) 4 (14.3) 24 (23.8) 30 (22.2) 289 (43.9) 319 (40.2)
Subtotal 6 28 101 135 658 793
Margin
Microscopic Margins Clear 9 (100.0) 72 (86.7) 282 (84.4) 363 (85.2) 3891 (88.6) 4254 (88.3)
Microscopic Margins positive (<=1mm) 0 10 (12.0) 50 (15.0) 60 (14.1) 471 (10.7) 531 (11.0)
Macroscopic Residual Cancer, grossly positive 0 1 (1.2) 2 (0.6) 3 (0.7) 32 (0.7) 35 (0.7)
Subtotal 9 83 334 426 4394 4820
cT
0 0 0 0 0 1 (0.0) 1 (0.0)
1 0 4 (5.6) 3 (1.0) 7 (1.8) 117 (3.0) 124 (2.9)
2 1 (7.1) 7 (9.9) 42 (13.9) 50 (12.9) 622 (15.9) 672 (15.6)
3 11 (78.6) 50 (70.4) 228 (75.5) 289 (74.7) 2812 (71.9) 3101 (72.2)
4 2 (14.3) 10 (14.1) 29 (9.6) 41 (10.6) 357 (9.1) 398 (9.3)
Subtotal 14 71 302 387 3909 4296
cN
0 4 (30.8) 23 (37.1) 117 (42.9) 144 (41.4) 1873 (52.9) 2017 (51.9)
1 6 (46.2) 34 (54.8) 124 (45.4) 164 (47.1) 1409 (39.8) 1573 (40.5)
2 3 (23.1) 5 (8.1) 32 (11.7) 40 (11.5) 258 (7.3) 298 (7.7)
Subtotal 13 62 273 348 3540 3888
Overall Clinical Stage
0 0 0 0 0 6 (0.2) 6 (0.2)
1 0 2 (3.2) 3 (1.1) 5 (1.5) 91 (2.7) 96 (2.6)
2 4 (30.8) 21 (33.3) 108 (40.4) 133 (38.8) 1627 (48.0) 1760 (47.2)
3 9 (69.2) 40 (63.5) 156 (58.4) 205 (59.8) 1664 (49.1) 1869 (50.1)
Subtotal 13 63 267 343 3388 3731
pT
0 1 (6.7) 3 (3.1) 32 (8.4) 36 (7.3) 253 (5.0) 289 (5.2)
1 0 15 (15.3) 38 (9.9) 53 (10.7) 526 (10.3) 579 (10.3)
2 1 (6.7) 22 (22.4) 92 (24.0) 115 (23.2) 1449 (28.4) 1564 (27.9)
3 11 (73.3) 51 (52.0) 193 (50.4) 255 (51.4) 2628 (51.4) 2883 (51.4)
4 2 (13.3) 7 (7.1) 28 (7.3) 37 (7.5) 254 (5.0) 291 (5.2)
Subtotal 15 98 383 496 5110 5606
pN
0 4 (26.7) 48 (49.5) 200 (54.2) 252 (52.4) 2837 (58.4) 3089 (57.8)
1 9 (60.0) 32 (33.0) 108 (29.3) 149 (31.0) 1368 (28.1) 1517 (28.4)
2 2 (13.3) 17 (17.5) 61 (16.5) 80 (16.6) 656 (13.5) 736 (13.8)
Subtotal 15 97 369 481 4861 5342
Overall Pathological Stage
0 1 (6.7) 3 (3.2) 26 (7.1) 30 (6.3) 223 (4.6) 253 (4.8)
1 0 8 (8.4) 19 (5.2) 27 (5.7) 276 (5.7) 303 (5.7)
2 3 (20.0) 44 (46.3) 183 (49.9) 230 (48.2) 2572 (53.5) 2802 (53.0)
3 11 (73.3) 40 (42.1) 139 (37.9) 190 (39.8) 1738 (36.1) 1928 (36.5)
Subtotal 15 95 367 477 4809 5286

Function for summarizing non-metastatic rectal cancer pre-op RT data

summary_pre_op_rt <- function(df, rt_course_filter, group_var) {
  rt_filter <- case_when(rt_course_filter == "short" ~ "Short-course RT",
                         rt_course_filter == "long" ~ "Long-course chemoRT",
                         rt_course_filter == "all" ~ c("Short-course RT","Long-course chemoRT"))
  var <- deparse(substitute(group_var))
  Reduce(function(x, y) merge(x, y, by = var, all = TRUE),
         list(df %>% 
                filter(rt_course %in% rt_filter,
                       M1_at_Dx == 0,
                       neoadjuvant == 1) %>% 
                group_by(age_group2, {{group_var}}, .drop = FALSE) %>% 
                summarize(sub_total = n()) %>% 
                pivot_wider(names_from = age_group2, values_from = sub_total) %>% 
                mutate_at(vars(-var), ~replace(., is.na(.), 0)) %>% 
                select(-`>=50`),
              df %>% 
                filter(rt_course %in% rt_filter,
                       M1_at_Dx == 0,
                       neoadjuvant == 1) %>% 
                group_by(age_group1, {{group_var}}, .drop = FALSE) %>% 
                summarize(sub_total = n()) %>% 
                pivot_wider(names_from = age_group1, values_from = sub_total) %>% 
                mutate_at(vars(-var), ~replace(., is.na(.), 0)),
              df %>% 
                filter(rt_course %in% rt_filter,
                       M1_at_Dx == 0,
                       neoadjuvant == 1) %>% 
                group_by({{group_var}}, .drop = FALSE) %>% 
                summarize(`Grand Total` = n()) %>% 
                mutate_at(vars(-var), ~replace(., is.na(.), 0)))
  ) %>% 
    arrange({{group_var}}) %>%
    mutate({{group_var}} := as.character({{group_var}})) %>%
    filter(!is.na({{group_var}})) %>%
    bind_rows(summarize(., across(where(is.numeric), sum),
                        across(where(is.character), ~"Subtotal"))) %>%
    rename(Parameter = {{group_var}}) %>% 
    
    # add percentage
    mutate_at(vars(-Parameter),
              ~if_else((Parameter != "Subtotal") & (.x != 0),
                       paste0(.x, ' (', sprintf("%.1f", round(.x/.x[Parameter == "Subtotal"]*100, 1)), ')'),
                       # paste0(.x, ' (', sprintf("%.1f", round(.x/sum(.x)*100, 1)), ')'),
                       as.character(.x)))
}

Table 2: Outcome for all non-metastatic rectal cancer cases with neoadjuvant therapy

All RT course <=29 30-39 40-49 <50 >=50 Grand Total
Neoadjuvant Therapy
Short-course RT 1 (25.0) 13 (26.5) 90 (45.9) 104 (41.8) 1439 (61.2) 1543 (59.3)
Long-course chemoRT 3 (75.0) 36 (73.5) 106 (54.1) 145 (58.2) 913 (38.8) 1058 (40.7)
Subtotal 4 49 196 249 2352 2601
cT Stage
1 0 0 0 0 9 (0.4) 9 (0.4)
2 1 (25.0) 7 (14.9) 24 (12.6) 32 (13.2) 295 (13.4) 327 (13.4)
3 1 (25.0) 36 (76.6) 161 (84.3) 198 (81.8) 1783 (80.9) 1981 (81.0)
4 2 (50.0) 4 (8.5) 6 (3.1) 12 (5.0) 117 (5.3) 129 (5.3)
Subtotal 4 47 191 242 2204 2446
cN Stage
0 2 (50.0) 13 (30.2) 66 (38.8) 81 (37.3) 945 (48.4) 1026 (47.3)
1 1 (25.0) 25 (58.1) 83 (48.8) 109 (50.2) 869 (44.5) 978 (45.1)
2 1 (25.0) 5 (11.6) 21 (12.4) 27 (12.4) 138 (7.1) 165 (7.6)
Subtotal 4 43 170 217 1952 2169
Overall Clinical Stage
1 0 0 0 0 7 (0.4) 7 (0.3)
2 2 (50.0) 15 (34.1) 73 (42.9) 90 (41.3) 965 (50.1) 1055 (49.2)
3 2 (50.0) 29 (65.9) 97 (57.1) 128 (58.7) 953 (49.5) 1081 (50.4)
Subtotal 4 44 170 218 1925 2143
ypT Stage
0 1 (25.0) 3 (6.2) 22 (11.4) 26 (10.6) 165 (7.1) 191 (7.4)
1 0 6 (12.5) 10 (5.2) 16 (6.5) 127 (5.5) 143 (5.6)
2 0 8 (16.7) 52 (26.9) 60 (24.5) 685 (29.4) 745 (29.0)
3 3 (75.0) 28 (58.3) 101 (52.3) 132 (53.9) 1263 (54.3) 1395 (54.2)
4 0 3 (6.2) 8 (4.1) 11 (4.5) 87 (3.7) 98 (3.8)
Subtotal 4 48 193 245 2327 2572
ypN Stage
0 1 (25.0) 26 (53.1) 103 (53.4) 130 (52.8) 1451 (62.0) 1581 (61.2)
1 3 (75.0) 13 (26.5) 49 (25.4) 65 (26.4) 619 (26.5) 684 (26.5)
2 0 10 (20.4) 41 (21.2) 51 (20.7) 269 (11.5) 320 (12.4)
Subtotal 4 49 193 246 2339 2585
Overall Pathological Stage
0 1 (25.0) 3 (6.2) 16 (8.3) 20 (8.2) 147 (6.3) 167 (6.5)
1 0 4 (8.3) 8 (4.2) 12 (4.9) 110 (4.7) 122 (4.8)
2 0 19 (39.6) 99 (51.6) 118 (48.4) 1304 (56.2) 1422 (55.5)
3 3 (75.0) 22 (45.8) 69 (35.9) 94 (38.5) 759 (32.7) 853 (33.3)
Subtotal 4 48 192 244 2320 2564
T Stage Change
Downstage 3 (75.0) 18 (39.1) 76 (40.2) 97 (40.6) 836 (38.3) 933 (38.6)
No change 0 23 (50.0) 101 (53.4) 124 (51.9) 1178 (54.0) 1302 (53.8)
Upstage 1 (25.0) 5 (10.9) 12 (6.3) 18 (7.5) 166 (7.6) 184 (7.6)
Subtotal 4 46 189 239 2180 2419
N Stage Change
Downstage 1 (25.0) 16 (37.2) 47 (27.8) 64 (29.6) 575 (29.6) 639 (29.6)
No change 2 (50.0) 18 (41.9) 80 (47.3) 100 (46.3) 974 (50.2) 1074 (49.8)
Upstage 1 (25.0) 9 (20.9) 42 (24.9) 52 (24.1) 391 (20.2) 443 (20.5)
Subtotal 4 43 169 216 1940 2156
Overal Stage Change
Downstage 1 (25.0) 17 (39.5) 68 (40.5) 86 (40.0) 662 (34.8) 748 (35.4)
No change 2 (50.0) 21 (48.8) 81 (48.2) 104 (48.4) 1011 (53.2) 1115 (52.7)
Upstage 1 (25.0) 5 (11.6) 19 (11.3) 25 (11.6) 227 (11.9) 252 (11.9)
Subtotal 4 43 168 215 1900 2115
Total Number of Nodes
<12 0 10 (20.4) 70 (36.1) 80 (32.4) 1000 (43.0) 1080 (42.0)
>=12 4 (100.0) 39 (79.6) 124 (63.9) 167 (67.6) 1325 (57.0) 1492 (58.0)
Subtotal 4 49 194 247 2325 2572
Total Number of Positive Nodes
0 1 (25.0) 27 (55.1) 106 (54.9) 134 (54.5) 1484 (64.1) 1618 (63.2)
1-2 2 (50.0) 8 (16.3) 39 (20.2) 49 (19.9) 476 (20.6) 525 (20.5)
3-4 1 (25.0) 4 (8.2) 20 (10.4) 25 (10.2) 153 (6.6) 178 (7.0)
5-9 0 3 (6.1) 23 (11.9) 26 (10.6) 142 (6.1) 168 (6.6)
10+ 0 7 (14.3) 5 (2.6) 12 (4.9) 60 (2.6) 72 (2.8)
Subtotal 4 49 193 246 2315 2561
pCR Status
No 3 (75.0) 45 (93.8) 176 (91.7) 224 (91.8) 2170 (93.7) 2394 (93.5)
Yes 1 (25.0) 3 (6.2) 16 (8.3) 20 (8.2) 147 (6.3) 167 (6.5)
Subtotal 4 48 192 244 2317 2561

Table 3: Outcome for non-metastatic rectal cancer cases with neoadjuvant therapy and short-course RT

Short-course RT <=29 30-39 40-49 <50 >=50 Grand Total
cT Stage
1 0 0 0 0 8 (0.6) 8 (0.6)
2 1 (100.0) 3 (27.3) 18 (20.7) 22 (22.2) 240 (18.2) 262 (18.5)
3 0 7 (63.6) 69 (79.3) 76 (76.8) 1063 (80.7) 1139 (80.4)
4 0 1 (9.1) 0 1 (1.0) 6 (0.5) 7 (0.5)
Subtotal 1 11 87 99 1317 1416
cN Stage
0 1 (100.0) 4 (44.4) 33 (46.5) 38 (46.9) 690 (61.3) 728 (60.3)
1 0 5 (55.6) 33 (46.5) 38 (46.9) 399 (35.4) 437 (36.2)
2 0 0 5 (7.0) 5 (6.2) 37 (3.3) 42 (3.5)
Subtotal 1 9 71 81 1126 1207
Overall Clinical Stage
1 0 0 0 0 7 (0.6) 7 (0.6)
2 1 (100.0) 4 (44.4) 40 (56.3) 45 (55.6) 718 (65.3) 763 (64.7)
3 0 5 (55.6) 31 (43.7) 36 (44.4) 374 (34.0) 410 (34.7)
Subtotal 1 9 71 81 1099 1180
ypT Stage
0 0 0 2 (2.2) 2 (1.9) 27 (1.9) 29 (1.9)
1 0 2 (15.4) 7 (7.8) 9 (8.7) 73 (5.1) 82 (5.3)
2 0 2 (15.4) 28 (31.1) 30 (28.8) 460 (32.0) 490 (31.8)
3 1 (100.0) 9 (69.2) 48 (53.3) 58 (55.8) 841 (58.6) 899 (58.4)
4 0 0 5 (5.6) 5 (4.8) 35 (2.4) 40 (2.6)
Subtotal 1 13 90 104 1436 1540
ypN Stage
0 0 5 (38.5) 44 (49.4) 49 (47.6) 869 (60.4) 918 (59.6)
1 1 (100.0) 4 (30.8) 19 (21.3) 24 (23.3) 370 (25.7) 394 (25.6)
2 0 4 (30.8) 26 (29.2) 30 (29.1) 199 (13.8) 229 (14.9)
Subtotal 1 13 89 103 1438 1541
Overall Pathological Stage
0 0 0 2 (2.2) 2 (1.9) 24 (1.7) 26 (1.7)
1 0 1 (7.7) 5 (5.6) 6 (5.8) 65 (4.5) 71 (4.6)
2 0 5 (38.5) 45 (50.6) 50 (48.5) 867 (60.4) 917 (59.6)
3 1 (100.0) 7 (53.8) 37 (41.6) 45 (43.7) 479 (33.4) 524 (34.1)
Subtotal 1 13 89 103 1435 1538
T Stage Change
Downstage 0 4 (36.4) 28 (32.2) 32 (32.3) 398 (30.3) 430 (30.4)
No change 0 5 (45.5) 49 (56.3) 54 (54.5) 778 (59.2) 832 (58.9)
Upstage 1 (100.0) 2 (18.2) 10 (11.5) 13 (13.1) 138 (10.5) 151 (10.7)
Subtotal 1 11 87 99 1314 1413
N Stage Change
Downstage 0 2 (22.2) 13 (18.3) 15 (18.5) 210 (18.7) 225 (18.7)
No change 0 4 (44.4) 32 (45.1) 36 (44.4) 628 (55.8) 664 (55.1)
Upstage 1 (100.0) 3 (33.3) 26 (36.6) 30 (37.0) 287 (25.5) 317 (26.3)
Subtotal 1 9 71 81 1125 1206
Overal Stage Change
Downstage 0 4 (44.4) 19 (26.8) 23 (28.4) 251 (22.9) 274 (23.3)
No change 0 3 (33.3) 38 (53.5) 41 (50.6) 654 (59.7) 695 (59.1)
Upstage 1 (100.0) 2 (22.2) 14 (19.7) 17 (21.0) 190 (17.4) 207 (17.6)
Subtotal 1 9 71 81 1095 1176
Total Number of Nodes
<12 0 3 (23.1) 35 (39.3) 38 (36.9) 599 (42.3) 637 (41.9)
>=12 1 (100.0) 10 (76.9) 54 (60.7) 65 (63.1) 818 (57.7) 883 (58.1)
Subtotal 1 13 89 103 1417 1520
Total Number of Positive Nodes
0 0 5 (38.5) 44 (50.0) 49 (48.0) 883 (61.7) 932 (60.8)
1-2 1 (100.0) 2 (15.4) 14 (15.9) 17 (16.7) 292 (20.4) 309 (20.2)
3-4 0 2 (15.4) 11 (12.5) 13 (12.7) 102 (7.1) 115 (7.5)
5-9 0 1 (7.7) 16 (18.2) 17 (16.7) 103 (7.2) 120 (7.8)
10+ 0 3 (23.1) 3 (3.4) 6 (5.9) 50 (3.5) 56 (3.7)
Subtotal 1 13 88 102 1430 1532
pCR Status
No 1 (100.0) 13 (100.0) 87 (97.8) 101 (98.1) 1411 (98.3) 1512 (98.3)
Yes 0 0 2 (2.2) 2 (1.9) 24 (1.7) 26 (1.7)
Subtotal 1 13 89 103 1435 1538

Table 4: Outcome for non-metastatic rectal cancer cases with neoadjuvant therapy and long-course chemoRT

Long-course chemoRT <=29 30-39 40-49 <50 >=50 Grand Total
cT Stage
1 0 0 0 0 1 (0.1) 1 (0.1)
2 0 4 (11.1) 6 (5.8) 10 (7.0) 55 (6.2) 65 (6.3)
3 1 (33.3) 29 (80.6) 92 (88.5) 122 (85.3) 720 (81.2) 842 (81.7)
4 2 (66.7) 3 (8.3) 6 (5.8) 11 (7.7) 111 (12.5) 122 (11.8)
Subtotal 3 36 104 143 887 1030
cN Stage
0 1 (33.3) 9 (26.5) 33 (33.3) 43 (31.6) 255 (30.9) 298 (31.0)
1 1 (33.3) 20 (58.8) 50 (50.5) 71 (52.2) 470 (56.9) 541 (56.2)
2 1 (33.3) 5 (14.7) 16 (16.2) 22 (16.2) 101 (12.2) 123 (12.8)
Subtotal 3 34 99 136 826 962
Overall Clinical Stage
2 1 (33.3) 11 (31.4) 33 (33.3) 45 (32.8) 247 (29.9) 292 (30.3)
3 2 (66.7) 24 (68.6) 66 (66.7) 92 (67.2) 579 (70.1) 671 (69.7)
Subtotal 3 35 99 137 826 963
ypT Stage
0 1 (33.3) 3 (8.6) 20 (19.4) 24 (17.0) 138 (15.5) 162 (15.7)
1 0 4 (11.4) 3 (2.9) 7 (5.0) 54 (6.1) 61 (5.9)
2 0 6 (17.1) 24 (23.3) 30 (21.3) 225 (25.3) 255 (24.7)
3 2 (66.7) 19 (54.3) 53 (51.5) 74 (52.5) 422 (47.4) 496 (48.1)
4 0 3 (8.6) 3 (2.9) 6 (4.3) 52 (5.8) 58 (5.6)
Subtotal 3 35 103 141 891 1032
ypN Stage
0 1 (33.3) 21 (58.3) 59 (56.7) 81 (56.6) 582 (64.6) 663 (63.5)
1 2 (66.7) 9 (25.0) 30 (28.8) 41 (28.7) 249 (27.6) 290 (27.8)
2 0 6 (16.7) 15 (14.4) 21 (14.7) 70 (7.8) 91 (8.7)
Subtotal 3 36 104 143 901 1044
Overall Pathological Stage
0 1 (33.3) 3 (8.6) 14 (13.6) 18 (12.8) 123 (13.9) 141 (13.7)
1 0 3 (8.6) 3 (2.9) 6 (4.3) 45 (5.1) 51 (5.0)
2 0 14 (40.0) 54 (52.4) 68 (48.2) 437 (49.4) 505 (49.2)
3 2 (66.7) 15 (42.9) 32 (31.1) 49 (34.8) 280 (31.6) 329 (32.1)
Subtotal 3 35 103 141 885 1026
T Stage Change
Downstage 3 (100.0) 14 (40.0) 48 (47.1) 65 (46.4) 438 (50.6) 503 (50.0)
No change 0 18 (51.4) 52 (51.0) 70 (50.0) 400 (46.2) 470 (46.7)
Upstage 0 3 (8.6) 2 (2.0) 5 (3.6) 28 (3.2) 33 (3.3)
Subtotal 3 35 102 140 866 1006
N Stage Change
Downstage 1 (33.3) 14 (41.2) 34 (34.7) 49 (36.3) 365 (44.8) 414 (43.6)
No change 2 (66.7) 14 (41.2) 48 (49.0) 64 (47.4) 346 (42.5) 410 (43.2)
Upstage 0 6 (17.6) 16 (16.3) 22 (16.3) 104 (12.8) 126 (13.3)
Subtotal 3 34 98 135 815 950
Overal Stage Change
Downstage 1 (33.3) 13 (38.2) 49 (50.5) 63 (47.0) 411 (51.1) 474 (50.5)
No change 2 (66.7) 18 (52.9) 43 (44.3) 63 (47.0) 357 (44.3) 420 (44.7)
Upstage 0 3 (8.8) 5 (5.2) 8 (6.0) 37 (4.6) 45 (4.8)
Subtotal 3 34 97 134 805 939
Total Number of Nodes
<12 0 7 (19.4) 35 (33.3) 42 (29.2) 401 (44.2) 443 (42.1)
>=12 3 (100.0) 29 (80.6) 70 (66.7) 102 (70.8) 507 (55.8) 609 (57.9)
Subtotal 3 36 105 144 908 1052
Total Number of Positive Nodes
0 1 (33.3) 22 (61.1) 62 (59.0) 85 (59.0) 601 (67.9) 686 (66.7)
1-2 1 (33.3) 6 (16.7) 25 (23.8) 32 (22.2) 184 (20.8) 216 (21.0)
3-4 1 (33.3) 2 (5.6) 9 (8.6) 12 (8.3) 51 (5.8) 63 (6.1)
5-9 0 2 (5.6) 7 (6.7) 9 (6.2) 39 (4.4) 48 (4.7)
10+ 0 4 (11.1) 2 (1.9) 6 (4.2) 10 (1.1) 16 (1.6)
Subtotal 3 36 105 144 885 1029
pCR Status
No 2 (66.7) 32 (91.4) 89 (86.4) 123 (87.2) 759 (86.1) 882 (86.2)
Yes 1 (33.3) 3 (8.6) 14 (13.6) 18 (12.8) 123 (13.9) 141 (13.8)
Subtotal 3 35 103 141 882 1023

Figure 1: Clinical and pathological staging for all non-metastatic rectal cancer cases

figure1_data <- data_filtered %>% 
  filter(M1_at_Dx == 0) %>% 
  select(cT, cN, overall_clin_stg, pT, pN, overall_path_stg, age_group1, age_group2) %>% 
  rename(`Overall Clinical` = overall_clin_stg,
         `Overall Pathological` = overall_path_stg) %>% 
  mutate_all(as.factor)

figure1_color <- c("#e0f3f8", "#74add1", "#fdae61", "#f46d43", "#a50026")

Panel A: Young age groups

figure1_panelA_plot <- function(df, var_name) {
  df %>% 
    filter(!is.na({{var_name}})) %>% 
    group_by({{var_name}}, age_group2, .drop = FALSE) %>%
    summarize(n = n()) %>% 
    group_by(age_group2) %>% 
    mutate(n_total = sum(n),
           percent = round(n / n_total *100, 2)) %>% 
    ungroup() %>% 
    bind_cols(binom.confint(.$n, .$n_total, conf.level = 0.95, methods = "exact")[c("lower", "upper")]*100) %>% 
    ggplot(aes(age_group2, percent, group = {{var_name}})) +
    geom_point(aes(color = {{var_name}}), size = 2.5) +
    geom_line(aes(color = {{var_name}}), size = 1) +
    geom_ribbon(aes(ymin = lower, ymax = upper, fill = {{var_name}}), alpha = 0.1) +
    scale_color_manual(values = figure1_color,
                       name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
    scale_fill_manual(values = figure1_color,
                      name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
    scale_x_discrete(labels = c("<=29" = "\u226429",
                                ">=50" = "\u226550")) +
    labs(x = "Age Group",
         y = "Percentage (%)",
         title = str_wrap(paste0("Distribution of ", deparse(substitute(var_name)), " Stages"), 45)) +
    ylim(0, 100) +
    theme_light(base_size = 18) +
    theme(axis.text = element_text(size = 18))
}

Panel B: All age groups

figure1_panelB_plot_side <- function(df, var_name) {
  df %>% 
    filter(!is.na({{var_name}})) %>% 
    group_by({{var_name}}, age_group1) %>%
    summarize(n = n()) %>% 
    group_by(age_group1) %>% 
    mutate(n_total = sum(n),
           percent = round(n / n_total *100, 2)) %>% 
    ggplot(aes(age_group1, percent, fill = {{var_name}})) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(aes(label = paste0(sprintf("%.1f", percent), "%\n(", n, ")")),
              position = position_dodge(width = 0.9), vjust = -0.25,
              color = "black", size = 3.5) +
    scale_fill_manual(values = figure1_color,
                      name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
    scale_x_discrete(labels = c(">=50" = "\u226550")) +
    labs(x = "Age Group",
         y = "Percentage (%)",
         title = str_wrap(paste0("Distribution of ", deparse(substitute(var_name)), " Stages"), 45)) +
    theme_light(base_size = 18) +
    theme(axis.text = element_text(size = 18)) +
    ylim(0, 100)
}

Panel B: All age groups (Alternative plots)

figure1_panelB_plot_stack <- function(df, var_name) {
  df %>% 
    filter(!is.na({{var_name}})) %>% 
    group_by({{var_name}}, age_group1) %>%
    summarize(n = n()) %>% 
    group_by(age_group1) %>% 
    mutate(n_total = sum(n),
           percent = round(n / n_total *100, 2),
           y_pos_left = if_else(age_group1 == "<50", 100 - (cumsum(percent) - percent/2), NA_real_),
           y_pos_right = if_else(age_group1 == ">=50", 100 - (cumsum(percent) - percent/2), NA_real_)) %>% 
    ggplot(aes(age_group1, percent, fill = {{var_name}})) +
    geom_bar(stat = "identity", position = "stack", width = 0.3) +
    geom_label_repel(aes(x = 0.85, y = y_pos_left,
                         label = paste0(sprintf("%.1f", percent), "%\n(", n, ")")),
                     nudge_x = -0.4, direction = "y", box.padding = 0.5, 
                     color = "black", alpha = 0.8, size = 3.5, fontface = "bold", 
                     show.legend = FALSE) +
    geom_label_repel(aes(x = 2.15, y = y_pos_right,
                         label = paste0(sprintf("%.1f", percent), "%\n(", n, ")")),
                     nudge_x = 0.4, direction = "y", box.padding = 0.5, 
                     color = "black", alpha = 0.8, size = 3.5, fontface = "bold", 
                     show.legend = FALSE) +
    scale_fill_manual(values = figure1_color,
                      name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
    scale_x_discrete(labels = c(">=50" = "\u226550")) +
    labs(x = "Age Group",
         y = "Percentage (%)",
         title = str_wrap(paste0("Distribution of ", deparse(substitute(var_name)), " Stages"), 45)) +
    theme_light(base_size = 18) +
    theme(axis.text = element_text(size = 18))
}

Figure 2: Effectiveness of long-course ChemoRT treatment among all non-metastatic rectal cancer cases

figure2_data <- data_filtered %>% 
  filter(M1_at_Dx == 0,
         neoadjuvant == 1) %>% 
  select(cT, cN, overall_clin_stg, pT, pN, overall_path_stg,
         T_stg_change_status, N_stg_change_status, overall_stg_change_status,
         rt_course, pCR, age_group1, age_group2) %>% 
  mutate(overall_stg_change_status = case_when(pCR == "Yes" ~ "Complete",
                                               TRUE ~ as.character(overall_stg_change_status))) %>% 
  mutate_all(as.factor) %>% 
  rename(`Overall Clinical` = overall_clin_stg,
         `Overall Pathological` = overall_path_stg)

Panel A: Treatment types

figure2_panelA_color <- c("#abd9e9", "#fdae61")

figure2_panelA_plot <- function(df, age_group_var) {
  df %>% 
    filter(!is.na(rt_course), rt_course != "Long-course RT") %>% 
    group_by(rt_course, {{age_group_var}}) %>%
    summarize(n = n()) %>% 
    group_by({{age_group_var}}) %>% 
    mutate(n_total = sum(n),
           percent = round(n / n_total * 100, 1)) %>% 
    ggplot(aes(x = factor(1), y = percent, fill = rt_course)) +
    geom_bar(stat = "identity", color = "white") +
    coord_polar("y", start = 0) +
    facet_grid(cols = vars({{age_group_var}}), labeller = as_labeller(c("<50" = "Age group: <50",
                                                                        ">=50" = "Age group: \u226550",
                                                                        "<=29" = "Age group: \u226429",
                                                                        "30-39" = "30-39",
                                                                        "40-49" = "40-49"))) +
    geom_text(aes(label = paste0(n,"\n(", sprintf("%.1f", percent), "%)")),
              position = position_stack(vjust = 0.5),
              color = "black", size = 3.5) +
    scale_fill_manual(values = figure2_panelA_color,
                      name = "Neoadjuvant Treatment") +
    theme_void(base_size = 14) +
    theme(strip.text = element_text(margin = margin(b = 5)))
}

Panel B: Stage change

figure2_panelB_color <- c("#313695", "#abd9e9", "#f46d43", "#a50026")

figure2_panelB_plot <- function(df, age_group_var) {
  df %>% 
    filter(rt_course == "Long-course chemoRT") %>% 
    group_by({{age_group_var}}, `Overall Clinical`, `Overall Pathological`, overall_stg_change_status) %>%
    drop_na() %>% 
    summarize(n = n()) %>% 
    group_by({{age_group_var}}) %>% 
    mutate(n_total = sum(n),
           percent = round(n / n_total * 100, 1)) %>% 
    ggplot(aes(y = percent, axis1 = `Overall Clinical`, axis2 = `Overall Pathological`)) +
    coord_cartesian(clip = "off") +
    geom_alluvium(aes(fill = overall_stg_change_status),
                  width = 1/12, reverse =FALSE) +
    geom_stratum(width = 0.1, reverse =FALSE) +
    geom_label(stat = "stratum", aes(label = after_stat(stratum)),
               size = 4, fontface = "bold",
               label.size = NA, fill = NA, reverse =FALSE) +
    facet_wrap(vars({{age_group_var}}), labeller = as_labeller(c("<50" = "Age group: <50",
                                                                 ">=50" = "Age group: \u226550")),
               scales = "free") +
    scale_x_discrete(limits = c("Overall\nClinical\nStage", "Overall\nPathological\nStage"),
                     expand = c(0.01, 0.01)) +
    scale_fill_manual(values = figure2_panelB_color,
                      name = "Stage Change") +
    labs(title = str_wrap(paste0("Overall Stage Change"), 45)) +
    theme_void(base_size = 12) +
    theme(axis.text.x = element_text(size = 12),
          plot.margin = margin(rep(25,10)),
          aspect.ratio = 1.2,
          panel.spacing = unit(3, "lines"),
          strip.text = element_text(margin = margin(b = 5)))
}

Panel C: pCR rate

figure2_panelC_color <- c("#fdae61", "#abd9e9")

figure2_panelC_plot <- function(df, age_group_var) {
  temp_df <- df %>% 
    filter(!is.na(pCR), rt_course == "Long-course chemoRT") %>% 
    group_by({{age_group_var}}, pCR) %>%
    summarize(n = n()) %>% 
    group_by({{age_group_var}}) %>%
    mutate(n_total = sum(n),
           percent = round(n / n_total *100, 2)) %>%
    ungroup() %>%
    bind_cols(binom.confint(.$n, .$n_total, conf.level = 0.95, methods = "exact")[c("lower", "upper")]*100)
  
  n_total_label <- temp_df %>% 
    select({{age_group_var}}, n_total) %>% 
    unique()

  temp_df %>% 
    ggplot(aes(x = {{age_group_var}}, y = percent, fill = pCR)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_errorbar(aes(ymin = lower, ymax = upper), width = .2, position = position_dodge(0.9)) +
    scale_fill_manual(values = figure2_panelC_color,
                      name = "pCR") +
    labs(x = "Age Group",
         y = "pCR Rate (%)",
         title = str_wrap("pCR rate in different age groups", 45)) +
    scale_x_discrete(labels = c("<=29" = paste0("\u226429 \n(n=", filter(n_total_label, {{age_group_var}} == "<=29")$n_total, ")"),
                                "30-39" = paste0("30-39 \n(n=", filter(n_total_label, {{age_group_var}} == "30-39")$n_total, ")"),
                                "40-49" = paste0("40-49 \n(n=", filter(n_total_label, {{age_group_var}} == "40-49")$n_total, ")"),
                                "<50" = paste0("<50 \n(n=", filter(n_total_label, {{age_group_var}} == "<50")$n_total, ")"),
                                ">=50" = paste0("\u226550 \n(n=", filter(n_total_label, {{age_group_var}} == ">=50")$n_total, ")"))) +
    ylim(0, 100) +
    theme_light(base_size = 18) +
    theme(axis.text = element_text(size = 18),
          strip.text = element_text(margin = margin(b = 5)))
}

Figure 3: Survival analysis for all rectal cancer cases

figure3_data <- data_filtered %>% 
  select(age_group1, age_group2, diagnosis_date, M1_at_Dx, neoadjuvant, rt_course,
         survyrs, pat_status, surv_status, death_date,
         colorectaldeath, colorectaldthdat, crc_death_status,
         m1dtofdx, m1site, fst_relapse_date, fst_reg_dist_date,
         LOCIND, LOCDATE, REGIND, REGDATE, DISTIND, DISTDATE,
         subseq_colorectalca, subseq_CRC_dx_date,
         recurr_status, dis_free_yrs, crc_recurr_status, crc_recurr_free_yrs)
figure3_plot_maxyear <- function(df, time_var, status_var, age_group_var, title_text, break_yr) {
  fit <- eval(substitute(survfit(Surv(time_var, status_var) ~ age_group_var, data = df)))
  figure3_color <- case_when(length(fit[["strata"]]) == 2 ~ list(c("#0571b0", "#ca0020")),
                             length(fit[["strata"]]) == 3 ~ list(c("#abd9e9", "#fdae61", "#f46d43")),
                             length(fit[["strata"]]) == 4 ~ list(c("#abd9e9", "#fdae61", "#f46d43", "#a50026")))[[1]]
  legend_text <- if_else(deparse(substitute(age_group_var)) == "age_group1", list(c("<50", "\u226550")),
                         list(c("\u226429", "30-39", "40-49", "\u226550")))[[1]]
  ggsurvplot(fit,
             palette = figure3_color,
             pval = TRUE,
             conf.int = FALSE,
             risk.table = TRUE,
             tables.height = 0.3,
             risk.table.fontsize = 3.5,
             break.time.by = break_yr,
             surv.median.line = "hv",
             xlab = "Time (year)",
             ylab = "Survival Probability",
             title = str_wrap(title_text, 62),
             legend.title = "Age Group",
             legend.labs = legend_text[1:length(figure3_color)])
}
figure3_plot_month <- function(df, time_var, status_var, age_group_var, title_text, break_yr, max_yr) {
  # df <- df %>%
  #   mutate({{status_var}} := case_when({{time_var}} > max_yr ~ 0,
  #                                      TRUE ~ {{status_var}}),
  #          {{time_var}} := case_when({{time_var}} > max_yr ~ max_yr,
  #                                      TRUE ~ {{time_var}}))
  fit <- eval(substitute(survfit(Surv(time_var, status_var) ~ age_group_var, data = df)))
  figure3_color <- case_when(length(fit[["strata"]]) == 2 ~ list(c("#0571b0", "#ca0020")),
                             length(fit[["strata"]]) == 3 ~ list(c("#abd9e9", "#fdae61", "#f46d43")),
                             length(fit[["strata"]]) == 4 ~ list(c("#abd9e9", "#fdae61", "#f46d43", "#a50026")))[[1]]
  legend_text <- if_else(deparse(substitute(age_group_var)) == "age_group1", list(c("<50", "\u226550")),
                         list(c("\u226429", "30-39", "40-49", "\u226550")))[[1]]
  ggsurvplot(fit,
             palette = figure3_color,
             xscale = "y_m",
             pval = TRUE,
             conf.int = FALSE,
             risk.table = TRUE,
             tables.height = 0.3,
             risk.table.fontsize = 3.5,
             break.time.by = break_yr,
             surv.median.line = "hv",
             xlab = "Time (month)",
             ylab = "Survival Probability",
             title = str_wrap(title_text, 62),
             legend.title = "Age Group",
             legend.labs = legend_text[1:length(figure3_color)],
             xlim = c(0, max_yr))
}

Panel A: Overall survival












Panel B: Disease specific survival












Panel C: Disease free survival





Figure 4: Cox regression analysis

figure4_data <- data_filtered %>% 
  filter(M1_at_Dx == 0, 
         rt_course != "Long-course RT" | is.na(rt_course),
         !(rt_merged %in% c(5, 6, 77, 88, NA)),
         !is.na(loc_merged),
         !is.na(overall_path_stg)) %>% 
  droplevels() %>% 
  select(age_group1, age_group2, sex, loc_merged, overall_clin_stg,
         overall_path_stg, neoadjuvant, rt_merged, rt_course,
         survyrs, pat_status, surv_status, colorectaldeath, crc_death_status,
         LOCIND, REGIND, DISTIND, subseq_colorectalca, recurr_status,
         dis_free_yrs, crc_recurr_status) %>% 
  mutate(
    across(c(overall_clin_stg, overall_path_stg), ~as.factor(.x)),
    Age = factor(age_group1, labels = c("<50", "50+")),
    rt_course = as.character(rt_course),
    rt_course = if_else(rt_merged %in% c(0, 3, 4), "No", rt_course) %>% 
      factor(levels = c("Long-course chemoRT", "Short-course RT", "No"))) %>% 
  filter(!(neoadjuvant == 0 & rt_course %in% c("Long-course chemoRT", "Short-course RT"))) %>% 
  rename(Sex = sex,
         `Tumour Location` = loc_merged,
         `Overall Pathological Stage` = overall_path_stg,
         `Neoadjuvant Therapy` = rt_course)
figure4_plot <- function(df, time_var, status_var, title_text) {
  fit_ph <- eval(substitute(coxph(Surv(time_var, status_var) ~
                                    Sex + Age + `Tumour Location` +
                                    `Overall Pathological Stage` +
                                    `Neoadjuvant Therapy`,
                                  data = as.data.frame(df))))
  ggforest(fit_ph,
           main = title_text,
           cpositions = c(0.005, 0.21, 0.4),
           fontsize = 0.6,
           refLabel = "Reference")
}